Load packages:

library(tidyverse)
library(labelled)

Resources:

1 Generate random distributions

num_obs <- 10000

# Generate standard normal distribution (default is mean = 0 and sd = 1)
set.seed(124)
stdnorm_dist <- rnorm(num_obs, mean = 0, sd = 1)  # equivalent to rnorm(10)

# Generate normal distribution w/ custom mean and sd
set.seed(124)
norm_dist <- rnorm(num_obs, mean = 50, sd = 5)

# Generate left-skewed distribution
set.seed(124)
lskew_dist <- rbeta(num_obs, 5, 2)

# Generate right-skewed distribution
set.seed(124)
rskew_dist <- rbeta(num_obs, 2, 5)

# Create dataframe
df_generated_pop <- data.frame(stdnorm_dist, norm_dist, lskew_dist, rskew_dist)
df_ipeds_pop %>% head()
## # A tibble: 6 x 38
##   instnm                              unitid opeid6 opeid                       control                                               c15basic stabbr       city       zip                    locale                                            region tuit_grad_res fee_grad_res tuit_grad_nres fee_grad_nres tuit_md_res fee_md_res tuit_md_nres fee_md_nres tuit_law_res fee_law_res tuit_law_nres fee_law_nres books_supplies roomboard_off oth_expense_off tuitfee_grad_res tuitfee_grad_nres tuitfee_md_res tuitfee_md_nres tuitfee_law_res tuitfee_law_nres coa_grad_res coa_grad_nres coa_md_res coa_md_nres coa_law_res coa_law_nres
##   <chr>                                <dbl> <chr>  <chr>                     <dbl+lbl>                                              <dbl+lbl> <chr+lbl>    <chr>      <chr>               <dbl+lbl>                                         <dbl+lbl>         <dbl>        <dbl>          <dbl>         <dbl>       <dbl>      <dbl>        <dbl>       <dbl>        <dbl>       <dbl>         <dbl>        <dbl>          <dbl>         <dbl>           <dbl>            <dbl>             <dbl>          <dbl>           <dbl>           <dbl>            <dbl>        <dbl>         <dbl>      <dbl>       <dbl>       <dbl>        <dbl>
## 1 Alabama A & M University            100654 001002 00100200 1 [Public]                 18 [Master^s Colleges & Universities: Larger Programs] AL [Alabama] Normal     35762      12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]         10128         1414          20160          1414          NA         NA           NA          NA           NA          NA            NA           NA           1600          9240            3090            11542             21574             NA              NA              NA               NA        25472         35504         NA          NA          NA           NA
## 2 University of Alabama at Birmingham 100663 001052 00105200 1 [Public]                 15 [Doctoral Universities: Highest Research Activity]  AL [Alabama] Birmingham 35294-0110 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]          8100            0          19188             0       28978          0        62714           0           NA          NA            NA           NA           1200         12307            5555             8100             19188          28978           62714              NA               NA        27162         38250      48040       81776          NA           NA
## 3 Amridge University                  100690 025034 02503400 2 [Private not-for-profit] 20 [Master^s Colleges & Universities: Small Programs]  AL [Alabama] Montgomery 36117-3553 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]         11700         1300          11700          1300          NA         NA           NA          NA           NA          NA            NA           NA            900          9600            1600            13000             13000             NA              NA              NA               NA        25100         25100         NA          NA          NA           NA
## 4 University of Alabama in Huntsville 100706 001055 00105500 1 [Public]                 16 [Doctoral Universities: Higher Research Activity]   AL [Alabama] Huntsville 35899      12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]         10632          826          24430           826          NA         NA           NA          NA           NA          NA            NA           NA           2120         10400            3994            11458             25256             NA              NA              NA               NA        27972         41770         NA          NA          NA           NA
## 5 Alabama State University            100724 001005 00100500 1 [Public]                 19 [Master^s Colleges & Universities: Medium Programs] AL [Alabama] Montgomery 36104-0271 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]          7416         2740          14832          2740          NA         NA           NA          NA           NA          NA            NA           NA           1600          7320            4228            10156             17572             NA              NA              NA               NA        23304         30720         NA          NA          NA           NA
## 6 The University of Alabama           100751 001051 00105100 1 [Public]                 16 [Doctoral Universities: Higher Research Activity]   AL [Alabama] Tuscaloosa 35487-0100 12 [City: Midsize] 5 [Southeast AL AR FL GA KY LA MS NC SC TN VA WV]         10780            0          30250             0       28978          0        62714           0        23610           0         43060            0           1000         13636            4600            10780             30250          28978           62714           23610            43060        30016         49486      48214       81950       42846        62296
df_generated_pop %>% head()
##   stdnorm_dist norm_dist lskew_dist rskew_dist
## 1  -1.38507062  43.07465  0.9172827 0.08271725
## 2   0.03832318  50.19162  0.7064826 0.29351743
## 3  -0.76303016  46.18485  0.8444117 0.15558827
## 4   0.21230614  51.06153  0.6694614 0.33053865
## 5   1.42553797  57.12769  0.3488487 0.65115131
## 6   0.74447982  53.72240  0.5401472 0.45985283

2 Plot population distribution

# Function to get sampling distribution (default: 1000 samples of size 200)
get_sampling_distribution <- function(data_vec, num_samples = 1000, sample_size = 200) {
  sample_means <- vector(mode = 'numeric', num_samples)

  for (i in 1:length(sample_means)) {
    samp <- sample(data_vec, sample_size)
    sample_means[[i]] <- mean(samp, na.rm = T)
  }

  sample_means
}

# Function to generate plot
plot_distribution <- function(data_vec1, data_vec2 = NULL, data_df = NULL, data_var = NULL, group_var = NULL, group_cat = NULL, pop_labels = NULL, show_group_hist = F, sampling_dist = F, plot_title = '') {
  
  two_pop <- !is.null(group_var) || !is.null(data_vec2)
  two_dist <- two_pop && !sampling_dist
  
  # Prep dataframe
  if (!is.null(data_df)) {
    data_df[[data_var]] <- unclass(data_df[[data_var]])  # unclass haven_labelled
    if (two_pop) {
      data_df[[group_var]] <- unclass(data_df[[group_var]])
    }
    
    data_df <- data_df %>% filter(!is.na(get(data_var)))  # remove NA rows
    
    # If group_cat not provided, use 2 values from group_var
    if (two_pop && is.null(group_cat)) {
      group_cat <- sort(unique(na.omit(data_df[[group_var]])))[1:2]
    }
    
    # Create population vector(s)
    if (!two_pop) {  # single population
      data_vec1 <- data_df[[data_var]]
    } else {  # two populations
      data_vec1 <- (data_df %>% filter(get(group_var) == group_cat[[1]]))[[data_var]]
      data_vec2 <- (data_df %>% filter(get(group_var) == group_cat[[2]]))[[data_var]]
    }
  }

  # Get sampling distribution
  if (sampling_dist) {
    if (!two_pop) {  # single population
      data_vec1 <- get_sampling_distribution(data_vec1)
    } else {  # two populations
      data_vec1_samp <- get_sampling_distribution(data_vec1)
      data_vec2_samp <- get_sampling_distribution(data_vec2)
      
      data_vec1 <- data_vec1_samp - data_vec2_samp
    }
  }
  
  # Legend text
  legend_text <- c(paste('Mean:', round(mean(data_vec1), 2),
                         '\nStd Dev:', round(sd(data_vec1), 2)),
                   paste('Median:', round(median(data_vec1), 2)))
  
  if (two_dist) {
    legend_text <- c(legend_text, 
                     paste('Mean:', round(mean(data_vec2), 2),
                           '\nStd Dev:', round(sd(data_vec2), 2)),
                     paste('Median:', round(median(data_vec2), 2)))
  }
  
  if (!is.null(pop_labels)) {
    pop1 <- pop_labels[[1]]
    pop2 <- pop_labels[[2]]
  } else if (!is.null(group_var)) {
    pop1 <- str_c(group_var, '=', group_cat[[1]])
    pop2 <- str_c(group_var, '=', group_cat[[2]])
  } else {
    pop1 <- 'Pop1'
    pop2 <- 'Pop2'
  }
  
  # Create statistics dataframe
  if (!two_dist) {
    lines_vec <- c('dotted')
    stats_vec <- c(mean(data_vec1), median(data_vec1))
    legend_title <- 'Statistics'
    
    if (two_pop && sampling_dist) {
      legend_title <- str_c('Statistics\n(', pop1, ' - ', pop2, ')')
    }
  } else {
    lines_vec <- c('dotted', 'dotdash')
    stats_vec <- c(mean(data_vec1), median(data_vec1), mean(data_vec2), median(data_vec2))
    legend_title <- str_c('Statistics\n(', pop1, ' vs. ', pop2, ')')
  }
  
  stats_df <- data.frame(
    pop = rep(lines_vec, each = 2),
    stat = rep(c('blue', 'red'), times = if_else(two_dist, 2, 1)),
    val = stats_vec
  )
  stats_df$pop <- factor(stats_df$pop, levels = c('dotted', 'dotdash'))
  
  # Plot distribution(s)
  p <- ggplot() +
    ggtitle(plot_title) + xlab('') + ylab('') +
    geom_density(aes(x = data_vec1), alpha = 0.8)
  
  if (!two_dist || show_group_hist) {  # show histogram only if 1 pop or show_group_hist is TRUE
    p <- p +
      geom_histogram(aes(x = data_vec1, y = ..density..), alpha = 0.4, position = 'identity')
  }
  
  if (two_dist) {
    p <- p +
      geom_density(aes(x = data_vec2), alpha = 0.8)
    
    if (show_group_hist) {  # show histogram only if show_group_hist is TRUE
      p <- p +
        geom_histogram(aes(x = data_vec2, y = ..density..), alpha = 0.4, position = 'identity', fill = 'wheat4')
    }
  }
  
  p <- p +
    geom_vline(data = stats_df,
               aes(xintercept = val, color = interaction(stat, pop), linetype = interaction(stat, pop)),
               size = 0.6, alpha = 0.8) +
    scale_color_manual(name = legend_title,
                       labels = legend_text,
                       values = as.character(stats_df$stat)) +
    scale_linetype_manual(name = legend_title,
                          labels = legend_text,
                          values = as.character(stats_df$pop)) +
    theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
          legend.title = element_text(size = 9, face = 'bold'),
          legend.text = element_text(size = 8)) +
    guides(col = guide_legend(ncol = if_else(two_dist, 2, 1)))
  
  p
}

2.1 Single population

# IPEDS data: tuitfee_grad_res
plot_distribution(df_ipeds_pop$tuitfee_grad_res)

# IPEDS data: tuitfee_grad_res
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res')

2.2 Two populations

# IPEDS data: tuitfee_grad_res by control (default: use first 2 categorical values in group_var)
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control')

# IPEDS data: tuitfee_grad_res by control, shade histogram
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  show_group_hist = T)

# IPEDS data: tuitfee_grad_res by control, custom order
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  group_cat = c(2, 1))

# IPEDS data: tuitfee_grad_res by control, custom categories
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  group_cat = c(2, 3))

# Alternatively, prepare 2 populations first to pass in
pop1 <- (df_ipeds_pop %>% filter(unclass(control) == 1))$tuitfee_grad_res
pop2 <- (df_ipeds_pop %>% filter(unclass(control) == 2))$tuitfee_grad_res

plot_distribution(pop1, pop2)

# Custom labels
plot_distribution(pop1, pop2, pop_labels = c('Public', 'Private not-for-profit'))

3 Plot single random sample

# Take single random sample of size 200
set.seed(124)
df_generated_sample <- df_generated_pop[sample(nrow(df_generated_pop), 200), ]

set.seed(124)
df_ipeds_sample <- df_ipeds_pop[sample(nrow(df_ipeds_pop), 200), ]

# Randomly generated data: standard normal
plot_distribution(df_generated_sample$stdnorm_dist)

4 Plot sampling distribution

4.1 Single population

# IPEDS data: tuitfee_grad_res
plot_distribution(df_ipeds_pop$tuitfee_grad_res, sampling_dist = T)

# IPEDS data: tuitfee_grad_res
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', sampling_dist = T)

4.2 Two populations

# IPEDS data: tuitfee_grad_res by control (control=1 minus control=2)
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  sampling_dist = T, pop_labels = c('Public', 'Private'))

# IPEDS data: tuitfee_grad_res by control (control=2 minus control=1)
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  group_cat = c(2, 1), sampling_dist = T, pop_labels = c('Private', 'Public'))

5 Hypothesis testing

# Function to generate t-distribution plot
plot_t_distribution <- function(data_vec1, data_vec2 = NULL, data_df = NULL, data_var = NULL, group_var = NULL, group_cat = NULL, mu = 0, alpha = 0.05, alternative = 'two.sided', plot_title = '', shade_rejection = T, shade_pval = T, stacked = F) {
  
  two_pop <- !is.null(group_var) || !is.null(data_vec2)
  
  # Prep dataframe
  if (!is.null(data_df)) {
    data_df[[data_var]] <- unclass(data_df[[data_var]])  # unclass haven_labelled
    if (two_pop) {
      data_df[[group_var]] <- unclass(data_df[[group_var]])
    }
    
    data_df <- data_df %>% filter(!is.na(get(data_var)))  # remove NA rows
    
    # If group_cat not provided, use 2 values from group_var
    if (two_pop && is.null(group_cat)) {
      group_cat <- sort(unique(na.omit(data_df[[group_var]])))[1:2]
    }
    
    # Create samples
    if (!two_pop) {  # single sample
      data_vec1 <- data_df[[data_var]]
    } else {  # two samples
      data_vec1 <- (data_df %>% filter(get(group_var) == group_cat[[1]]))[[data_var]]
      data_vec2 <- (data_df %>% filter(get(group_var) == group_cat[[2]]))[[data_var]]
    }
  }
  
  # Calculate stats
  if (!two_pop) {  # single sample
    data_vec1 <- na.omit(data_vec1)
    
    # Calculate t-statistics
    sample_size <- length(data_vec1)
    deg_freedom <- sample_size - 1
    xbar <- mean(data_vec1)
    s <- sd(data_vec1)
    
    std_err <- s / sqrt(sample_size)
    t <- (xbar - mu) / std_err
  } else {  # two samples
    data_vec1 <- na.omit(data_vec1)
    data_vec2 <- na.omit(data_vec2)
    
    # Calculate t-statistics
    xbar1 <- mean(data_vec1)
    xbar2 <- mean(data_vec2)
    s1 <- sd(data_vec1)
    s2 <- sd(data_vec2)
    n1 <- length(data_vec1)
    n2 <- length(data_vec2)
    
    deg_freedom <- (s1**2/n1 + s2**2/n2)**2 / ((s1**2/n1)**2/(n1-1) + (s2**2/n2)**2/(n2-1))
    std_err <- sqrt(s1**2/n1 + s2**2/n2)
    t <- (xbar1 - xbar2) / std_err
  }
  
  # Calculate critical value and p-value
  if (alternative == 'less') {  # left-tailed
    cv_lower <- qt(p = alpha, df = deg_freedom, lower.tail = T)
    cv_legend <- round(cv_lower, 2)
    cv_legend2 <- round(cv_lower * std_err + mu, 2)
    pval <- round(pt(q = t, df = deg_freedom, lower.tail = T), 4)
  } else if (alternative == 'greater') {  # right-tailed
    cv_upper <- qt(p = alpha, df = deg_freedom, lower.tail = F)
    cv_legend <- round(cv_upper, 2)
    cv_legend2 <- round(cv_upper * std_err + mu, 2)
    pval <- round(pt(q = t, df = deg_freedom, lower.tail = F), 4)
  } else {  # two-tailed
    cv_lower <- qt(p = alpha / 2, df = deg_freedom, lower.tail = T)
    cv_upper <- qt(p = alpha / 2, df = deg_freedom, lower.tail = F)
    cv_legend <- str_c('\u00B1', round(cv_upper, 2))
    cv_legend2 <- str_c(round(cv_lower * std_err + mu, 2), ' & ', round(cv_upper * std_err + mu, 2))
    pval_half <- round(pt(q = t, df = deg_freedom, lower.tail = t < 0), 4)
    pval <- str_c(pval_half, ' + ', pval_half, ' = ', 2 * pval_half)
  }
  
  # Plot t-distribution
  p <- ggplot(data.frame(x = -c(-4, 4)), aes(x)) +
    ggtitle(plot_title) + xlab('') + ylab('') +
    stat_function(fun = dt, args = list(df = deg_freedom), xlim = c(-4, 4))
  
  # Shade rejection region using critical value
  if (alternative != 'greater') {
    p <- p + geom_vline(aes(xintercept = cv_lower, color = 'cval'),
                        linetype = 'dotted', size = 0.8, alpha = 0.8)
    
    if (shade_rejection) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(-4, cv_lower),
                             geom = 'area', alpha = 0.3, fill = 'red')
    }
    
    if (shade_pval) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(-4, if_else(alternative == 'two.sided', -abs(t), t)),
                             geom = 'area', alpha = 0.3, fill = 'blue')
    }
  }
  if (alternative != 'less') {
    p <- p + geom_vline(aes(xintercept = cv_upper, color = 'cval'),
                        linetype = 'dotted', size = 0.8, alpha = 0.8)
    
    if (shade_rejection) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(cv_upper, 4),
                             geom = 'area', alpha = 0.3, fill = 'red')
    }
    
    if (shade_pval) {
      p <- p + stat_function(fun = dt, args = list(df = deg_freedom),
                             xlim = c(if_else(alternative == 'two.sided', abs(t), t), 4),
                             geom = 'area', alpha = 0.3, fill = 'blue')
    }
  }
  
  # Legend text
  legend_text <- c('t-statistics / p-value', 'critical value / alpha')
  
  if (stacked) {
    legend_text <- c(str_c('t-statistics: ', round(t, 2),
                     '\n(p-value: ', str_extract(pval, '[\\d.-]+$'), ')'),
                     str_c('Critical value: ', cv_legend,
                     '\n(alpha: ', round(alpha, 2), ')'))
  }
  
  stats_text <- c(str_c('t-statistics: ', round(t, 2)),
                  str_c('SE: ', round(std_err, 2)),
                  str_c('p-value: ', pval),
                  str_c('Critical value: ', cv_legend),
                  str_c('alpha: ', round(alpha, 2)))
  
  if (!stacked) {
    p <- p +
      annotate('text', size = 9*5/14, x = 4.84, y = 0.14, hjust = 0,
               label = 'bold(Statistics)', parse = T) +
      annotate('text', size = 8*5/14, x = 4.89, y = 0:4 * -0.015 + 0.12, hjust = 0,
               label = stats_text)
  }
  
  # Label plot
  p <- p +
    geom_vline(aes(xintercept = t, color = 'tstat'),
               linetype = 'dotted', size = 0.8, alpha = 0.8) +
    scale_x_continuous(sec.axis = sec_axis(trans = ~ . * std_err + mu)) +
    scale_color_manual(name = if_else(stacked, 'Statistics', 'Legend'),
                       breaks = c('tstat', 'cval'),
                       labels = legend_text,
                       values = c(tstat = 'blue', cval = 'red')) +
    theme(plot.title = element_text(size = 10, face = 'bold', hjust = 0.5),
          plot.margin = unit(c(5.5, if_else(stacked, 5.5, 30), 5.5, 5.5), 'pt'),
          legend.title = element_text(size = 9, face = 'bold'),
          legend.text = element_text(size = 8)) +
    coord_cartesian(xlim = c(-4, 4),
                    clip = 'off')

  p
}

5.1 Single population

# H0: population mean of coa_grad_res is $29,000
# Ha: population mean of coa_grad_res is NOT $29,000 (two-sided test)
# Verdict: Fail to reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 29000)
## 
##  One Sample t-test
## 
## data:  df_ipeds_sample$coa_grad_res
## t = 1.2378, df = 199, p-value = 0.2173
## alternative hypothesis: true mean is not equal to 29000
## 95 percent confidence interval:
##  28405.20 31600.29
## sample estimates:
## mean of x 
##  30002.74
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 29000)

# H0: population mean of coa_grad_res is $32,000
# Ha: population mean of coa_grad_res is NOT $32,000 (two-sided test)
# Verdict: Reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 32000)
## 
##  One Sample t-test
## 
## data:  df_ipeds_sample$coa_grad_res
## t = -2.4653, df = 199, p-value = 0.01454
## alternative hypothesis: true mean is not equal to 32000
## 95 percent confidence interval:
##  28405.20 31600.29
## sample estimates:
## mean of x 
##  30002.74
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'coa_grad_res', mu = 32000)

# H0: population mean of coa_grad_res is $33,000
# Ha: population mean of coa_grad_res is LESS THAN $33,000 (left-sided test)
# Verdict: Reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 33000, alternative = 'less')
## 
##  One Sample t-test
## 
## data:  df_ipeds_sample$coa_grad_res
## t = -3.6997, df = 199, p-value = 0.0001396
## alternative hypothesis: true mean is less than 33000
## 95 percent confidence interval:
##      -Inf 31341.53
## sample estimates:
## mean of x 
##  30002.74
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 33000, alternative = 'less')

# H0: population mean of coa_grad_res is $31,000
# Ha: population mean of coa_grad_res is GREATER THAN $31,000 (right-sided test)
# Verdict: Fail to reject H0
t.test(df_ipeds_sample$coa_grad_res, mu = 31000, alternative = 'greater')
## 
##  One Sample t-test
## 
## data:  df_ipeds_sample$coa_grad_res
## t = -1.231, df = 199, p-value = 0.8901
## alternative hypothesis: true mean is greater than 31000
## 95 percent confidence interval:
##  28663.96      Inf
## sample estimates:
## mean of x 
##  30002.74
plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 31000, alternative = 'greater')

5.2 Two populations

# H0: population mean of tuitfee_grad_res is equal for control=1 and control=2
# Ha: population means are NOT equal
# Verdict: Reject H0
t.test(formula = tuitfee_grad_res ~ control, data = df_ipeds_sample, 
       subset = unclass(control) %in% c(1, 2))
## 
##  Welch Two Sample t-test
## 
## data:  tuitfee_grad_res by control
## t = -6.7041, df = 116.65, p-value = 0.0000000007599
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -11132.95  -6055.28
## sample estimates:
## mean in group 1 mean in group 2 
##        10227.46        18821.58
# Here, the t-statistics line is off the grid
plot_t_distribution(data_df = df_ipeds_sample, data_var = 'tuitfee_grad_res',
                    group_var = 'control', group_cat = c(1, 2))

# Alternatively, prepare 2 samples first to pass in
sample1 <- (df_ipeds_sample %>% filter(unclass(control) == 1))$tuitfee_grad_res
sample2 <- (df_ipeds_sample %>% filter(unclass(control) == 2))$tuitfee_grad_res

t.test(sample1, sample2)
## 
##  Welch Two Sample t-test
## 
## data:  sample1 and sample2
## t = -6.7041, df = 116.65, p-value = 0.0000000007599
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -11132.95  -6055.28
## sample estimates:
## mean of x mean of y 
##  10227.46  18821.58
plot_t_distribution(sample1, sample2)

6 Combine plots

library(patchwork)

# Use / to place plots 1 per row (stacked)
plot_distribution(df_generated_pop$norm_dist, plot_title = 'Population distribution') /
  plot_distribution(df_generated_sample$norm_dist, plot_title = 'Single sample distribution') /
  plot_distribution(df_generated_pop$norm_dist, sampling_dist = T,
                    plot_title = 'Sampling distribution')

# Or use + and plot_layout()
plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                  group_cat = c(2, 1), plot_title = 'Population distributions') +
  plot_distribution(data_df = df_ipeds_sample, data_var = 'tuitfee_grad_res', group_var = 'control',
                    group_cat = c(2, 1), plot_title = 'Single sample distributions') +
  plot_distribution(data_df = df_ipeds_pop, data_var = 'tuitfee_grad_res', group_var = 'control',
                    group_cat = c(2, 1), sampling_dist = T,
                    plot_title = 'Sampling distribution differences') +
  plot_layout(ncol = 1)

# When including t-distribution plot, need to specify stacked = T
plot_distribution(df_ipeds_sample$coa_grad_res, plot_title = 'Population distribution') +
  plot_distribution(df_ipeds_sample$coa_grad_res, plot_title = 'Single sample distribution') +
  plot_t_distribution(df_ipeds_sample$coa_grad_res, mu = 29000, stacked = T,
                      plot_title = 'Sampling distribution under null') +
  plot_layout(ncol = 1)